Aaron Williams | 4fd1e55 | 2021-04-23 19:56:32 +0200 | [diff] [blame] | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | /* |
| 3 | * Copyright (C) 2020 Marvell International Ltd. |
| 4 | * |
| 5 | * Support functions for managing command queues used for |
| 6 | * various hardware blocks. |
| 7 | * |
| 8 | * The common command queue infrastructure abstracts out the |
| 9 | * software necessary for adding to Octeon's chained queue |
| 10 | * structures. These structures are used for commands to the |
| 11 | * PKO, ZIP, DFA, RAID, HNA, and DMA engine blocks. Although each |
| 12 | * hardware unit takes commands and CSRs of different types, |
| 13 | * they all use basic linked command buffers to store the |
| 14 | * pending request. In general, users of the CVMX API don't |
| 15 | * call cvmx-cmd-queue functions directly. Instead the hardware |
| 16 | * unit specific wrapper should be used. The wrappers perform |
| 17 | * unit specific validation and CSR writes to submit the |
| 18 | * commands. |
| 19 | * |
| 20 | * Even though most software will never directly interact with |
| 21 | * cvmx-cmd-queue, knowledge of its internal workings can help |
| 22 | * in diagnosing performance problems and help with debugging. |
| 23 | * |
| 24 | * Command queue pointers are stored in a global named block |
| 25 | * called "cvmx_cmd_queues". Except for the PKO queues, each |
| 26 | * hardware queue is stored in its own cache line to reduce SMP |
| 27 | * contention on spin locks. The PKO queues are stored such that |
| 28 | * every 16th queue is next to each other in memory. This scheme |
| 29 | * allows for queues being in separate cache lines when there |
| 30 | * is a low number of queues per port. With 16 queues per port, |
| 31 | * the first queue for each port is in the same cache area. The |
| 32 | * second queues for each port are in another area, etc. This |
| 33 | * allows software to implement very efficient lockless PKO with |
| 34 | * 16 queues per port using a minimum of cache lines per core. |
| 35 | * All queues for a given core will be isolated in the same |
| 36 | * cache area. |
| 37 | * |
| 38 | * In addition to the memory pointer layout, cvmx-cmd-queue |
| 39 | * provides an optimized fair ll/sc locking mechanism for the |
| 40 | * queues. The lock uses a "ticket / now serving" model to |
| 41 | * maintain fair order on contended locks. In addition, it uses |
| 42 | * predicted locking time to limit cache contention. When a core |
| 43 | * knows it must wait in line for a lock, it spins on the |
| 44 | * internal cycle counter to completely eliminate any causes of |
| 45 | * bus traffic. |
| 46 | */ |
| 47 | |
| 48 | #ifndef __CVMX_CMD_QUEUE_H__ |
| 49 | #define __CVMX_CMD_QUEUE_H__ |
| 50 | |
/**
 * Compile-time switch for max depth support. It defaults to 0
 * (disabled) unless the build has already defined it: most programs
 * don't use the feature and it slows down the command queue
 * processing significantly.
 */
#if !defined(CVMX_CMD_QUEUE_ENABLE_MAX_DEPTH)
#define CVMX_CMD_QUEUE_ENABLE_MAX_DEPTH 0
#endif
| 59 | |
/**
 * Identifiers for all hardware blocks that are driven through command
 * queues. Every block owns a 16-bit sub-identifier space (up to 65536
 * command queues) that starts at the block's base value. Not all chips
 * support all hardware units.
 */
typedef enum {
	CVMX_CMD_QUEUE_PKO_BASE = 0x00000,
	CVMX_CMD_QUEUE_ZIP = 0x10000,
	CVMX_CMD_QUEUE_DFA = 0x20000,
	CVMX_CMD_QUEUE_RAID = 0x30000,
	CVMX_CMD_QUEUE_DMA_BASE = 0x40000,
	CVMX_CMD_QUEUE_BCH = 0x50000,
	CVMX_CMD_QUEUE_HNA = 0x60000,
	CVMX_CMD_QUEUE_END = 0x70000,
} cvmx_cmd_queue_id_t;

/*
 * Helpers that combine a block's base value with a queue number. Only
 * the low 16 bits of @queue are used.
 */
#define CVMX_CMD_QUEUE_PKO(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_PKO_BASE + ((queue) & 0xffff)))
#define CVMX_CMD_QUEUE_ZIP_QUE(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_ZIP + ((queue) & 0xffff)))
#define CVMX_CMD_QUEUE_DMA(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_DMA_BASE + ((queue) & 0xffff)))
/*
 * Shares its name with the enumerator above; as a function-like macro
 * it is only expanded when followed by an argument list.
 */
#define CVMX_CMD_QUEUE_BCH(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_BCH + ((queue) & 0xffff)))
| 83 | |
/*
 * Queue id of ZIP queue @queue on node @node: the node number is
 * placed at bit 24 and above, on top of the ZIP base value and the low
 * 16 bits of the queue number.
 */
#define CVMX_CMD_QUEUE_ZIP3_QUE(node, queue) \
	((cvmx_cmd_queue_id_t)(((node) << 24) | CVMX_CMD_QUEUE_ZIP | ((queue) & 0xffff)))
| 86 | |
/**
 * Result codes returned by the command queue functions. Zero means
 * success and every failure is a distinct negative value.
 *
 * A command write can fail if the command queue needs a new buffer and
 * the associated FPA pool is empty. It can also fail if the number of
 * queued command words reaches the maximum set at initialization.
 */
typedef enum {
	CVMX_CMD_QUEUE_ALREADY_SETUP = -4,
	CVMX_CMD_QUEUE_INVALID_PARAM = -3,
	CVMX_CMD_QUEUE_FULL = -2,
	CVMX_CMD_QUEUE_NO_MEMORY = -1,
	CVMX_CMD_QUEUE_SUCCESS = 0,
} cvmx_cmd_queue_result_t;
| 100 | |
| 101 | typedef struct { |
| 102 | /* First 64-bit word: */ |
| 103 | u64 fpa_pool : 16; |
| 104 | u64 base_paddr : 48; |
| 105 | s32 index; |
| 106 | u16 max_depth; |
| 107 | u16 pool_size_m1; |
| 108 | } __cvmx_cmd_queue_state_t; |
| 109 | |
| 110 | /** |
| 111 | * command-queue locking uses a fair ticket spinlock algo, |
| 112 | * with 64-bit tickets for endianness-neutrality and |
| 113 | * counter overflow protection. |
| 114 | * Lock is free when both counters are of equal value. |
| 115 | */ |
| 116 | typedef struct { |
| 117 | u64 ticket; |
| 118 | u64 now_serving; |
| 119 | } __cvmx_cmd_queue_lock_t; |
| 120 | |
| 121 | /** |
| 122 | * @INTERNAL |
| 123 | * This structure contains the global state of all command queues. |
| 124 | * It is stored in a bootmem named block and shared by all |
| 125 | * applications running on Octeon. Tickets are stored in a different |
| 126 | * cache line that queue information to reduce the contention on the |
| 127 | * ll/sc used to get a ticket. If this is not the case, the update |
| 128 | * of queue state causes the ll/sc to fail quite often. |
| 129 | */ |
| 130 | typedef struct { |
| 131 | __cvmx_cmd_queue_lock_t lock[(CVMX_CMD_QUEUE_END >> 16) * 256]; |
| 132 | __cvmx_cmd_queue_state_t state[(CVMX_CMD_QUEUE_END >> 16) * 256]; |
| 133 | } __cvmx_cmd_queue_all_state_t; |
| 134 | |
| 135 | extern __cvmx_cmd_queue_all_state_t *__cvmx_cmd_queue_state_ptrs[CVMX_MAX_NODES]; |
| 136 | |
| 137 | /** |
| 138 | * @INTERNAL |
| 139 | * Internal function to handle the corner cases |
| 140 | * of adding command words to a queue when the current |
| 141 | * block is getting full. |
| 142 | */ |
| 143 | cvmx_cmd_queue_result_t __cvmx_cmd_queue_write_raw(cvmx_cmd_queue_id_t queue_id, |
| 144 | __cvmx_cmd_queue_state_t *qptr, int cmd_count, |
| 145 | const u64 *cmds); |
| 146 | |
| 147 | /** |
| 148 | * Initialize a command queue for use. The initial FPA buffer is |
| 149 | * allocated and the hardware unit is configured to point to the |
| 150 | * new command queue. |
| 151 | * |
| 152 | * @param queue_id Hardware command queue to initialize. |
| 153 | * @param max_depth Maximum outstanding commands that can be queued. |
| 154 | * @param fpa_pool FPA pool the command queues should come from. |
| 155 | * @param pool_size Size of each buffer in the FPA pool (bytes) |
| 156 | * |
| 157 | * @return CVMX_CMD_QUEUE_SUCCESS or a failure code |
| 158 | */ |
| 159 | cvmx_cmd_queue_result_t cvmx_cmd_queue_initialize(cvmx_cmd_queue_id_t queue_id, int max_depth, |
| 160 | int fpa_pool, int pool_size); |
| 161 | |
| 162 | /** |
| 163 | * Shutdown a queue a free it's command buffers to the FPA. The |
| 164 | * hardware connected to the queue must be stopped before this |
| 165 | * function is called. |
| 166 | * |
| 167 | * @param queue_id Queue to shutdown |
| 168 | * |
| 169 | * @return CVMX_CMD_QUEUE_SUCCESS or a failure code |
| 170 | */ |
| 171 | cvmx_cmd_queue_result_t cvmx_cmd_queue_shutdown(cvmx_cmd_queue_id_t queue_id); |
| 172 | |
| 173 | /** |
| 174 | * Return the number of command words pending in the queue. This |
| 175 | * function may be relatively slow for some hardware units. |
| 176 | * |
| 177 | * @param queue_id Hardware command queue to query |
| 178 | * |
| 179 | * @return Number of outstanding commands |
| 180 | */ |
| 181 | int cvmx_cmd_queue_length(cvmx_cmd_queue_id_t queue_id); |
| 182 | |
| 183 | /** |
| 184 | * Return the command buffer to be written to. The purpose of this |
| 185 | * function is to allow CVMX routine access to the low level buffer |
| 186 | * for initial hardware setup. User applications should not call this |
| 187 | * function directly. |
| 188 | * |
| 189 | * @param queue_id Command queue to query |
| 190 | * |
| 191 | * @return Command buffer or NULL on failure |
| 192 | */ |
| 193 | void *cvmx_cmd_queue_buffer(cvmx_cmd_queue_id_t queue_id); |
| 194 | |
| 195 | /** |
| 196 | * @INTERNAL |
| 197 | * Retrieve or allocate command queue state named block |
| 198 | */ |
| 199 | cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(unsigned int node); |
| 200 | |
| 201 | /** |
| 202 | * @INTERNAL |
| 203 | * Get the index into the state arrays for the supplied queue id. |
| 204 | * |
| 205 | * @param queue_id Queue ID to get an index for |
| 206 | * |
| 207 | * @return Index into the state arrays |
| 208 | */ |
| 209 | static inline unsigned int __cvmx_cmd_queue_get_index(cvmx_cmd_queue_id_t queue_id) |
| 210 | { |
| 211 | /* Warning: This code currently only works with devices that have 256 |
| 212 | * queues or less. Devices with more than 16 queues are laid out in |
| 213 | * memory to allow cores quick access to every 16th queue. This reduces |
| 214 | * cache thrashing when you are running 16 queues per port to support |
| 215 | * lockless operation |
| 216 | */ |
| 217 | unsigned int unit = (queue_id >> 16) & 0xff; |
| 218 | unsigned int q = (queue_id >> 4) & 0xf; |
| 219 | unsigned int core = queue_id & 0xf; |
| 220 | |
| 221 | return (unit << 8) | (core << 4) | q; |
| 222 | } |
| 223 | |
| 224 | static inline int __cvmx_cmd_queue_get_node(cvmx_cmd_queue_id_t queue_id) |
| 225 | { |
| 226 | unsigned int node = queue_id >> 24; |
| 227 | return node; |
| 228 | } |
| 229 | |
| 230 | /** |
| 231 | * @INTERNAL |
| 232 | * Lock the supplied queue so nobody else is updating it at the same |
| 233 | * time as us. |
| 234 | * |
| 235 | * @param queue_id Queue ID to lock |
| 236 | * |
| 237 | */ |
| 238 | static inline void __cvmx_cmd_queue_lock(cvmx_cmd_queue_id_t queue_id) |
| 239 | { |
| 240 | } |
| 241 | |
| 242 | /** |
| 243 | * @INTERNAL |
| 244 | * Unlock the queue, flushing all writes. |
| 245 | * |
| 246 | * @param queue_id Queue ID to lock |
| 247 | * |
| 248 | */ |
| 249 | static inline void __cvmx_cmd_queue_unlock(cvmx_cmd_queue_id_t queue_id) |
| 250 | { |
| 251 | CVMX_SYNCWS; /* nudge out the unlock. */ |
| 252 | } |
| 253 | |
| 254 | /** |
| 255 | * @INTERNAL |
| 256 | * Initialize a command-queue lock to "unlocked" state. |
| 257 | */ |
| 258 | static inline void __cvmx_cmd_queue_lock_init(cvmx_cmd_queue_id_t queue_id) |
| 259 | { |
| 260 | unsigned int index = __cvmx_cmd_queue_get_index(queue_id); |
| 261 | unsigned int node = __cvmx_cmd_queue_get_node(queue_id); |
| 262 | |
| 263 | __cvmx_cmd_queue_state_ptrs[node]->lock[index] = (__cvmx_cmd_queue_lock_t){ 0, 0 }; |
| 264 | CVMX_SYNCWS; |
| 265 | } |
| 266 | |
| 267 | /** |
| 268 | * @INTERNAL |
| 269 | * Get the queue state structure for the given queue id |
| 270 | * |
| 271 | * @param queue_id Queue id to get |
| 272 | * |
| 273 | * @return Queue structure or NULL on failure |
| 274 | */ |
| 275 | static inline __cvmx_cmd_queue_state_t *__cvmx_cmd_queue_get_state(cvmx_cmd_queue_id_t queue_id) |
| 276 | { |
| 277 | unsigned int index; |
| 278 | unsigned int node; |
| 279 | __cvmx_cmd_queue_state_t *qptr; |
| 280 | |
| 281 | node = __cvmx_cmd_queue_get_node(queue_id); |
| 282 | index = __cvmx_cmd_queue_get_index(queue_id); |
| 283 | |
| 284 | if (cvmx_unlikely(!__cvmx_cmd_queue_state_ptrs[node])) |
| 285 | __cvmx_cmd_queue_init_state_ptr(node); |
| 286 | |
| 287 | qptr = &__cvmx_cmd_queue_state_ptrs[node]->state[index]; |
| 288 | return qptr; |
| 289 | } |
| 290 | |
| 291 | /** |
| 292 | * Write an arbitrary number of command words to a command queue. |
| 293 | * This is a generic function; the fixed number of command word |
| 294 | * functions yield higher performance. |
| 295 | * |
| 296 | * @param queue_id Hardware command queue to write to |
| 297 | * @param use_locking |
| 298 | * Use internal locking to ensure exclusive access for queue |
| 299 | * updates. If you don't use this locking you must ensure |
| 300 | * exclusivity some other way. Locking is strongly recommended. |
| 301 | * @param cmd_count Number of command words to write |
| 302 | * @param cmds Array of commands to write |
| 303 | * |
| 304 | * @return CVMX_CMD_QUEUE_SUCCESS or a failure code |
| 305 | */ |
| 306 | static inline cvmx_cmd_queue_result_t |
| 307 | cvmx_cmd_queue_write(cvmx_cmd_queue_id_t queue_id, bool use_locking, int cmd_count, const u64 *cmds) |
| 308 | { |
| 309 | cvmx_cmd_queue_result_t ret = CVMX_CMD_QUEUE_SUCCESS; |
| 310 | u64 *cmd_ptr; |
| 311 | |
| 312 | __cvmx_cmd_queue_state_t *qptr = __cvmx_cmd_queue_get_state(queue_id); |
| 313 | |
| 314 | /* Make sure nobody else is updating the same queue */ |
| 315 | if (cvmx_likely(use_locking)) |
| 316 | __cvmx_cmd_queue_lock(queue_id); |
| 317 | |
| 318 | /* Most of the time there is lots of free words in current block */ |
| 319 | if (cvmx_unlikely((qptr->index + cmd_count) >= qptr->pool_size_m1)) { |
| 320 | /* The rare case when nearing end of block */ |
| 321 | ret = __cvmx_cmd_queue_write_raw(queue_id, qptr, cmd_count, cmds); |
| 322 | } else { |
| 323 | cmd_ptr = (u64 *)cvmx_phys_to_ptr((u64)qptr->base_paddr); |
| 324 | /* Loop easy for compiler to unroll for the likely case */ |
| 325 | while (cmd_count > 0) { |
| 326 | cmd_ptr[qptr->index++] = *cmds++; |
| 327 | cmd_count--; |
| 328 | } |
| 329 | } |
| 330 | |
| 331 | /* All updates are complete. Release the lock and return */ |
| 332 | if (cvmx_likely(use_locking)) |
| 333 | __cvmx_cmd_queue_unlock(queue_id); |
| 334 | else |
| 335 | CVMX_SYNCWS; |
| 336 | |
| 337 | return ret; |
| 338 | } |
| 339 | |
| 340 | /** |
| 341 | * Simple function to write two command words to a command queue. |
| 342 | * |
| 343 | * @param queue_id Hardware command queue to write to |
| 344 | * @param use_locking |
| 345 | * Use internal locking to ensure exclusive access for queue |
| 346 | * updates. If you don't use this locking you must ensure |
| 347 | * exclusivity some other way. Locking is strongly recommended. |
| 348 | * @param cmd1 Command |
| 349 | * @param cmd2 Command |
| 350 | * |
| 351 | * @return CVMX_CMD_QUEUE_SUCCESS or a failure code |
| 352 | */ |
| 353 | static inline cvmx_cmd_queue_result_t cvmx_cmd_queue_write2(cvmx_cmd_queue_id_t queue_id, |
| 354 | bool use_locking, u64 cmd1, u64 cmd2) |
| 355 | { |
| 356 | cvmx_cmd_queue_result_t ret = CVMX_CMD_QUEUE_SUCCESS; |
| 357 | u64 *cmd_ptr; |
| 358 | |
| 359 | __cvmx_cmd_queue_state_t *qptr = __cvmx_cmd_queue_get_state(queue_id); |
| 360 | |
| 361 | /* Make sure nobody else is updating the same queue */ |
| 362 | if (cvmx_likely(use_locking)) |
| 363 | __cvmx_cmd_queue_lock(queue_id); |
| 364 | |
| 365 | if (cvmx_unlikely((qptr->index + 2) >= qptr->pool_size_m1)) { |
| 366 | /* The rare case when nearing end of block */ |
| 367 | u64 cmds[2]; |
| 368 | |
| 369 | cmds[0] = cmd1; |
| 370 | cmds[1] = cmd2; |
| 371 | ret = __cvmx_cmd_queue_write_raw(queue_id, qptr, 2, cmds); |
| 372 | } else { |
| 373 | /* Likely case to work fast */ |
| 374 | cmd_ptr = (u64 *)cvmx_phys_to_ptr((u64)qptr->base_paddr); |
| 375 | cmd_ptr += qptr->index; |
| 376 | qptr->index += 2; |
| 377 | cmd_ptr[0] = cmd1; |
| 378 | cmd_ptr[1] = cmd2; |
| 379 | } |
| 380 | |
| 381 | /* All updates are complete. Release the lock and return */ |
| 382 | if (cvmx_likely(use_locking)) |
| 383 | __cvmx_cmd_queue_unlock(queue_id); |
| 384 | else |
| 385 | CVMX_SYNCWS; |
| 386 | |
| 387 | return ret; |
| 388 | } |
| 389 | |
| 390 | /** |
| 391 | * Simple function to write three command words to a command queue. |
| 392 | * |
| 393 | * @param queue_id Hardware command queue to write to |
| 394 | * @param use_locking |
| 395 | * Use internal locking to ensure exclusive access for queue |
| 396 | * updates. If you don't use this locking you must ensure |
| 397 | * exclusivity some other way. Locking is strongly recommended. |
| 398 | * @param cmd1 Command |
| 399 | * @param cmd2 Command |
| 400 | * @param cmd3 Command |
| 401 | * |
| 402 | * @return CVMX_CMD_QUEUE_SUCCESS or a failure code |
| 403 | */ |
| 404 | static inline cvmx_cmd_queue_result_t |
| 405 | cvmx_cmd_queue_write3(cvmx_cmd_queue_id_t queue_id, bool use_locking, u64 cmd1, u64 cmd2, u64 cmd3) |
| 406 | { |
| 407 | cvmx_cmd_queue_result_t ret = CVMX_CMD_QUEUE_SUCCESS; |
| 408 | __cvmx_cmd_queue_state_t *qptr = __cvmx_cmd_queue_get_state(queue_id); |
| 409 | u64 *cmd_ptr; |
| 410 | |
| 411 | /* Make sure nobody else is updating the same queue */ |
| 412 | if (cvmx_likely(use_locking)) |
| 413 | __cvmx_cmd_queue_lock(queue_id); |
| 414 | |
| 415 | if (cvmx_unlikely((qptr->index + 3) >= qptr->pool_size_m1)) { |
| 416 | /* Most of the time there is lots of free words in current block */ |
| 417 | u64 cmds[3]; |
| 418 | |
| 419 | cmds[0] = cmd1; |
| 420 | cmds[1] = cmd2; |
| 421 | cmds[2] = cmd3; |
| 422 | ret = __cvmx_cmd_queue_write_raw(queue_id, qptr, 3, cmds); |
| 423 | } else { |
| 424 | cmd_ptr = (u64 *)cvmx_phys_to_ptr((u64)qptr->base_paddr); |
| 425 | cmd_ptr += qptr->index; |
| 426 | qptr->index += 3; |
| 427 | cmd_ptr[0] = cmd1; |
| 428 | cmd_ptr[1] = cmd2; |
| 429 | cmd_ptr[2] = cmd3; |
| 430 | } |
| 431 | |
| 432 | /* All updates are complete. Release the lock and return */ |
| 433 | if (cvmx_likely(use_locking)) |
| 434 | __cvmx_cmd_queue_unlock(queue_id); |
| 435 | else |
| 436 | CVMX_SYNCWS; |
| 437 | |
| 438 | return ret; |
| 439 | } |
| 440 | |
| 441 | #endif /* __CVMX_CMD_QUEUE_H__ */ |