// Loads a .bin file into a BeagleBone PRU and then interacts with it
// in shared PRU memory and (system-wide) DDR memory.
//
// Pass in the filename of the .bin file on the command line, eg:
// $ ./pru_loader foo.bin
//
// Compile with:
// gcc -std=gnu99 -o pru_loader pru_loader.c -lprussdrv
#include <unistd.h>
#include <stdio.h>
#include <inttypes.h>
#include <prussdrv.h>
#include <pruss_intc_mapping.h>
// Entry point. Loads the PRU program named by argv[1] into a PRU, tells it
// where the shared DDR segment lives (via the first two words of shared PRU
// RAM), reports progress once a second for 10 seconds, and then waits for the
// PRU to signal completion on PRU_EVTOUT_0.
//
// Returns 0 on success, 1 on a usage error or when a setup step fails.
int main(int argc, char **argv) {
    if (argc != 2) {
        printf("Usage: %s pru_code.bin\n", argv[0]);
        return 1;
    }

    // If this segfaults, make sure you're executing as root.
    prussdrv_init();
    if (prussdrv_open(PRU_EVTOUT_0) == -1) {
        printf("prussdrv_open() failed\n");
        return 1;
    }

    tpruss_intc_initdata pruss_intc_initdata = PRUSS_INTC_INITDATA;
    prussdrv_pruintc_init(&pruss_intc_initdata);

    // Map the shared PRU DRAM. A plain void * is handed to the driver (no
    // qualifier-dropping cast needed) and then viewed as volatile 32-bit
    // words, because the PRU updates this memory behind the compiler's back.
    void *shared_memory_void = NULL;
    prussdrv_map_prumem(PRUSS0_SHARED_DATARAM, &shared_memory_void);
    if (shared_memory_void == NULL) {
        fprintf(stderr, "prussdrv_map_prumem() failed\n");
        prussdrv_exit();
        return 1;
    }
    volatile uint32_t *shared_memory = shared_memory_void;

    // Map the DDR RAM set aside by the uio_pruss kernel module.
    void *shared_ddr_void = NULL;
    prussdrv_map_extmem(&shared_ddr_void);
    if (shared_ddr_void == NULL) {
        fprintf(stderr, "prussdrv_map_extmem() failed\n");
        prussdrv_exit();
        return 1;
    }
    volatile unsigned int *shared_ddr = shared_ddr_void;

    unsigned int shared_ddr_len = prussdrv_extmem_size();
    unsigned int physical_address = prussdrv_get_phys_addr(shared_ddr_void);
    printf("%u bytes of shared DDR available.\n Physical (PRU-side) address:%x\n",
           shared_ddr_len, physical_address);
    printf("Virtual (linux-side) address: %p\n\n", shared_ddr_void);

    // We'll use the first 8 bytes of PRU memory to tell it where the
    // shared segment of system memory is.
    shared_memory[0] = physical_address;
    shared_memory[1] = shared_ddr_len;

    // Change to 0 to use PRU0
    int which_pru = 1;
    if (prussdrv_exec_program(which_pru, argv[1]) < 0) {
        // Without this check a bad filename would leave us blocked forever in
        // prussdrv_pru_wait_event() below, since no PRU code would be running.
        fprintf(stderr, "prussdrv_exec_program() failed for %s\n", argv[1]);
        prussdrv_exit();
        return 1;
    }

    for (int i = 0; i < 10; i++) {
        sleep(1);
        // See if it's successfully writing the physical address of each word
        // at the (virtual, from our viewpoint) address.
        printf("DDR[%d] is: %p / 0x%x\n", i, (void *) (shared_ddr + i),
               shared_ddr[i]);
        // The PRU stores its completed-pass count in word 0 of shared RAM.
        // The byte total exceeds 32 bits well before the PRU finishes all of
        // its passes, so accumulate it in 64 bits.
        uint32_t passes = shared_memory[0];
        uint64_t bytes_written = (uint64_t) passes * shared_ddr_len;
        printf("Bytes written: %" PRIu64 "\n", bytes_written);
    }

    // Wait for the PRU to let us know it's done
    prussdrv_pru_wait_event(PRU_EVTOUT_0);
    printf("All done\n");

    prussdrv_pru_disable(which_pru);
    prussdrv_exit();
    return 0;
}
And here's the assembly:
// PRU program: repeatedly overwrites the shared DDR segment, storing into each
// 32-bit word that word's own (PRU-side) address, and publishes the number of
// completed passes in word 0 of shared PRU RAM so the host can poll progress.
//
// Inputs, read from shared PRU RAM at start-up:
//   word 0 (offset 0): address of the shared DDR segment
//   word 1 (offset 4): size of the segment in bytes
.origin 0
.entrypoint TOP
// Register aliases
#define DDR r29
#define DDR_SIZE r28
#define SHARED_RAM r27
// Address at which this code accesses the shared PRU RAM
#define SHARED_RAM_ADDRESS 0x10000
TOP:
// Enable OCP master ports in SYSCFG register: read the register (constant
// table entry C4, offset 4), clear bit 4, and write it back.
LBCO r0, C4, 4, 4
CLR r0, r0, 4
SBCO r0, C4, 4, 4
MOV SHARED_RAM, SHARED_RAM_ADDRESS
// From shared RAM, grab the address of the shared DDR segment
LBBO DDR, SHARED_RAM, 0, 4
// And the size of the segment from SHARED_RAM + 4
LBBO DDR_SIZE, SHARED_RAM, 4, 4
// BIGLOOP is one pass overwriting the shared DDR memory segment
// r12 counts completed passes; r14 holds the pass limit.
mov r12, 0
mov r14, 10000
BIGLOOP:
// Start at the beginning of the segment
// r10 is the write cursor; r11 is the end address (one past the last byte).
MOV r10, DDR
ADD r11, DDR, DDR_SIZE
// Tight loop writing the physical address of each word into that word
LOOP0:
// Store the 4 bytes of r10 at the address held in r10, then advance one word.
SBBO r10, r10, 0, 4
ADD r10, r10, 4
// Branch back while r10 < r11. Note the operand order: the original author
// observed that this form tests r10 < r11, the opposite of what they expected.
QBLT LOOP0, r11, r10
// One pass done: bump the counter and publish it in word 0 of shared RAM.
// This overwrites the DDR address stored there, which is already held in DDR.
ADD r12, r12, 1
SBBO r12, SHARED_RAM, 0, 4
// NOTE(review): by analogy with QBLT above, this presumably loops while
// r12 < r14, i.e. until 10000 passes have completed -- confirm.
QBGT BIGLOOP, r12, r14
// Interrupt the host so it knows we're done
// NOTE(review): 19 + 16 presumably selects system event 19 plus the strobe
// bit in r31 -- confirm against the PRU interrupt controller documentation.
MOV r31.b0, 19 + 16
// Don't forget to halt!
HALT
Here's the output I get, about 200MB/sec:
262144 bytes of shared DDR available.
Physical (PRU-side) address:9e6c0000
Virtual (linux-side) address: 0xb6d78000
DDR[0] is: 0xb6d78000 / 0x9e6c0000
Bytes written: 200540160
DDR[1] is: 0xb6d78004 / 0x9e6c0004
Bytes written: 401342464
DDR[2] is: 0xb6d78008 / 0x9e6c0008
Bytes written: 601882624
DDR[3] is: 0xb6d7800c / 0x9e6c000c
Bytes written: 802160640
DDR[4] is: 0xb6d78010 / 0x9e6c0010
Bytes written: 1002176512
DDR[5] is: 0xb6d78014 / 0x9e6c0014
Bytes written: 1202454528
DDR[6] is: 0xb6d78018 / 0x9e6c0018
Bytes written: 1402470400
DDR[7] is: 0xb6d7801c / 0x9e6c001c
Bytes written: 1602748416
DDR[8] is: 0xb6d78020 / 0x9e6c0020
Bytes written: 1802764288
DDR[9] is: 0xb6d78024 / 0x9e6c0024
Bytes written: 2003042304
All done
If I crank up the number of bytes written by SBBO from 4 to 8 (in the SBBO and ADD after LOOP0), then I think it ends up writing the contents of r10 and r11 into memory, and I get 320MB/sec. If I crank it up to 16 bytes per write, I get 450MB/sec.
So the PRU really can write very quickly to system RAM.
7 comments:
BTW, would it be OK to apply a GPL license to your two code examples here (the c and assembly) so it can be used in other projects? (Or GPLv2, or whatever open source license you prefer)
I'm a new beaglebone user, and would like to include your example in a disk image that helps other folks that are new to BeagleBone Black getting their PRU's working. Just need your permission to GPL it; I'll include your reply in a README.txt file along with the license file and headers, and can point you at the result.
BTW, here's a Makefile that pulls it together; this worked for me on a BBB with Debian 8.4:
http://seriss.com/people/erco/beaglebone/pru-ddr-memory-access/Makefile
(I tried to paste the Makefile here, but blogger.com's comments don't support the necessary tags like PRE to post code correctly..)
PS. Sorry about my first comment with the non-existent bug; the argv[3] I'd mentioned was something I must have accidentally introduced while re-indenting the code in vi.
Sure, feel free to use per GPL 2 or public domain. Also note that the ARM architecture used in BBB has an inefficient way of doing DMA, which ends up as a bottleneck between writing to RAM from a PRU and reading from the CPU (or vice versa). I believe you still get several hundred megabytes per second at least, but it's much less than the raw rate at which the PRU can access memory.
Thanks! Will do the gpl2.
Re DMA latency, yes, I was reading this page last night, and noted well that caveat.
I think for my purposes slow-ish is OK; I plan to send stepper motor velocities to the PRU for it to directly drive the step signals for about 8 motors hopefully. The sample rate for the velocities is around 100Hz, so I need linux to send the 8 vels (16 bits/vel) at a rate a little faster than that. The pulse trains the PRUs will be sending to the steppers would be more like 10KHz or so.. and my timing loop probably needs to be more like 1 MHz to get smooth jitter free pulse trains. Will kinda figure that out empirically. It's nice to have some extra headroom, up to 200MHz if I need it.
Haven't started yet, just accumulating examples right now, and want to make a BBB disk image with all the examples easily accessible, so folks can just write the image to a micro SDHC and reboot the BBB to start working right away, no patches or updates. I already posted such an image with two examples for blinking an LED, but would like to include a few RAM I/O examples too; in my application, I kinda need both GPIO bit twiddling and RAM I/O to the host.
Thanks again! I'll forward you a link to what I end up with; I'll include GPLv2 headers, COPYING.txt, the Makefile, and a README.md based on your blog with links to it, before I package it up into a disk image.
OK, prepared it all for you to see here. If that looks OK, or if you have any changes you'd like me to make, let me know.
I wasn't sure what to put for the copyright notice other than your blogger name.
Hi unixguy,
Did you end up making that disk image you mentioned? I'm trying to figure out how to best use the PRU, and in particular how to transfer data (picture frames of ~60KB each) relatively fast to the ARM side in the best way.
Erik
Post a Comment